در این سوال معیار فقر را درصد افراد زیر خط فقر در نظر گرفتهام. به وسیلهی آن فقیرترین کشورها را به دست آورده و سپس درصد افراد زیر خط فقر و حقوق روزانه و امید به زندگی آنها را به دست آوردم.
w_data = read.csv('data/WDI_csv/WDIData.csv')
w_series = read.csv('data/WDI_csv/WDISeries.csv')
w_country = read.csv('data/WDI_csv/WDICountry.csv')
pov_indicator = w_data %>% filter(Indicator.Code == 'SI.POV.NAHC')
pov_indicator$poverty_ratio = pov_indicator %>% select(starts_with('X')) %>% apply(., 1, mean, na.rm = TRUE)
poor_countries = pov_indicator %>% arrange(desc(poverty_ratio)) %>% top_n(10)
poor_countries %>% select(Country.Name, poverty_ratio)
## Country.Name poverty_ratio
## 1 Equatorial Guinea 76.8000
## 2 Zimbabwe 72.3000
## 3 Madagascar 71.6000
## 4 Eritrea 69.0000
## 5 Sao Tome and Principe 67.2500
## 6 Guinea-Bissau 67.0000
## 7 Congo, Dem. Rep. 66.6000
## 8 Burundi 66.0000
## 9 Swaziland 66.0000
## 10 Honduras 62.4125
life_indicator = w_data %>% filter((Country.Name %in% poor_countries$Country.Name) & Indicator.Code == 'SP.DYN.LE00.IN')
life_indicator$life_expectancy = life_indicator %>% select(starts_with("X")) %>% apply(., 1, mean, na.rm = TRUE)
life_indicator %>% select(Country.Name, life_expectancy)
## Country.Name life_expectancy
## 1 Burundi 48.65196
## 2 Congo, Dem. Rep. 48.79504
## 3 Equatorial Guinea 47.29447
## 4 Eritrea 50.61509
## 5 Guinea-Bissau 47.95014
## 6 Honduras 62.98398
## 7 Madagascar 52.46847
## 8 Sao Tome and Principe 60.64623
## 9 Swaziland 51.99816
## 10 Zimbabwe 54.07304
salary_indicator = w_data %>% filter((Country.Name %in% poor_countries$Country.Name) & Indicator.Code == 'NY.GNP.PCAP.PP.CD')
salary_indicator$salary = salary_indicator %>% select(starts_with('X')) %>% apply(., 1, mean, na.rm = TRUE)
salary_indicator$salary = salary_indicator$salary / 365
salary_indicator %>% arrange(desc(salary)) %>% hchart('column', hcaes(x = Country.Name, y = salary)) %>% hc_add_theme(hc_theme_monokai())
در سالهای نزدیک ۱۹۹۴ میلادی امید به زندگی کشور روآندا به شدت کاهش یافته که دلیل آن وقوع نسل کشی در کشور روآندا بوده که باعث کشته شدن افراد زیادی شده است.
rwa_life_expectancy = w_data %>% filter(Indicator.Code == 'SP.DYN.LE00.IN' & Country.Code == 'RWA') %>% select(starts_with('X')) %>% t() %>% data.frame() %>% setDT(keep.rownames = TRUE)
rwa_life_expectancy = rwa_life_expectancy[1:57,]
colnames(rwa_life_expectancy)[1] = 'year'
colnames(rwa_life_expectancy)[2] = 'life_expectancy'
life_indicator = w_data %>% filter(Indicator.Code == 'SP.DYN.LE00.IN')
life_indicator = life_indicator[48:264,] %>% melt
colnames(life_indicator)[5] = 'year'
colnames(life_indicator)[6] = 'life_expectancy'
hcboxplot(x = life_indicator$life_expectancy, var = life_indicator$year, outliers = FALSE) %>% hc_chart(type = 'column') %>% hc_add_series(rwa_life_expectancy, 'line', hcaes(x = year, y = life_expectancy), name = 'Rwanda') %>% hc_xAxis(title = list(text = 'Year')) %>% hc_yAxis(title = list(text = 'Life Expectancy')) %>% hc_title(text = 'Life Expectancy per Year') %>% hc_add_theme(hc_theme_monokai())
میتوان نتیجه گرفت که با زیاد شدن امید به زندگی میزان هزینه انجام شده برای سلامت بیشتر بوده است.
health_indicator = w_data %>% filter(Indicator.Code == 'SH.XPD.CHEX.GD.ZS')
health_indicator$average_cost = health_indicator %>% select(starts_with('X')) %>% apply(., 1, mean, na.rm = TRUE)
life_indicator = w_data %>% filter(Indicator.Code == 'SP.DYN.LE00.IN')
life_indicator$life_expectancy = life_indicator %>% select(starts_with('X')) %>% apply(., 1, mean, na.rm = TRUE)
health_life = inner_join(life_indicator %>% select(Country.Name, life_expectancy), health_indicator %>% select(Country.Name, average_cost), by = 'Country.Name')
ggplot(health_life, aes(x = life_expectancy , y = average_cost)) + geom_point(color = 'red') + geom_smooth(method = lm)
بله با توجه به نمودار زیر میتوان این موضوع را فهمید.
iran_indicator = w_data %>% filter(Indicator.Code == 'NY.GNP.PCAP.PP.CD' & Country.Code == 'IRN') %>% select(5:63) %>% t()
iran_indicator = data.frame(iran_indicator)
iran_indicator = setDT(iran_indicator, keep.rownames = TRUE)
iran_indicator = iran_indicator %>% filter(between(row_number(), 31, 59))
colnames(iran_indicator)[1] = 'year'
colnames(iran_indicator)[2] = 'purchasing_power'
hchart(iran_indicator, 'line', hcaes(x = year, y = purchasing_power)) %>% hc_add_theme(hc_theme_monokai())
در نمودار های زیر میتوانید این مقایسه ها را ببینید.
finance_indicators = w_series %>% filter(Series.Code %in% c('FP.CPI.TOTL.ZG', 'NY.GDP.MKTP.KD.ZG', 'NY.GDP.MKTP.KD', 'NY.GDP.FCST.CD', 'BX.GSR.TOTL.CD', 'CM.MKT.TRAD.GD.ZS', 'FR.INR.LNDP', 'CM.MKT.LCAP.GD.ZS', 'FR.INR.DPST', 'NE.GDI.STKB.CD', 'NE.GDI.TOTL.CD', 'NE.IMP.GNFS.ZS', 'NE.TRD.GNFS.ZS', 'NV.IND.MANF.ZS', 'NV.IND.TOTL.ZS', 'NY.TAX.NIND.CD', 'NY.GSR.NFCY.CD', 'TX.VAL.MRCH.WL.CD', 'BX.KLT.DINV.WD.GD.ZS' , 'FM.LBL.BMNY.GD.ZS')) %>% select(Series.Code, Indicator.Name)
lst = list()
sapply(1:20, function(index){
df = w_data %>% filter(Indicator.Code == finance_indicators$Series.Code[index]) %>% filter(between(row_number(), 48, 264)) %>% melt
colnames(df)[5] = 'year'
colnames(df)[6] = 'name'
iran_df = w_data %>% filter(Indicator.Code == finance_indicators$Series.Code[index] & Country.Code == 'IRN') %>% select(starts_with('X')) %>% t() %>% data.frame() %>% setDT(keep.rownames = TRUE) %>% filter(between(row_number(), 1, 57))
colnames(iran_df)[1] = 'year'
colnames(iran_df)[2] = 'name'
lst[[index]] = hcboxplot(x = df$name, var = df$year, outliers = FALSE) %>% hc_chart(type = 'column') %>% print(hc_add_series(name = 'Iran', iran_df, 'line', hcaes(x = year, y = name)) %>% hc_yAxis(title = list(text = finance_indicators$Indicator.Name[index])) %>% hc_xAxis(title = list(text = 'Year')) %>% hc_add_theme(hc_theme_monokai()))
})
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## x List,6 List,6 List,6 List,6 List,6 List,6 List,6 List,6
## width NULL NULL NULL NULL NULL NULL NULL NULL
## height NULL NULL NULL NULL NULL NULL NULL NULL
## sizingPolicy List,6 List,6 List,6 List,6 List,6 List,6 List,6 List,6
## dependencies NULL NULL NULL NULL NULL NULL NULL NULL
## elementId NULL NULL NULL NULL NULL NULL NULL NULL
## preRenderHook NULL NULL NULL NULL NULL NULL NULL NULL
## jsHooks List,0 List,0 List,0 List,0 List,0 List,0 List,0 List,0
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## x List,6 List,6 List,6 List,6 List,6 List,6 List,6 List,6
## width NULL NULL NULL NULL NULL NULL NULL NULL
## height NULL NULL NULL NULL NULL NULL NULL NULL
## sizingPolicy List,6 List,6 List,6 List,6 List,6 List,6 List,6 List,6
## dependencies NULL NULL NULL NULL NULL NULL NULL NULL
## elementId NULL NULL NULL NULL NULL NULL NULL NULL
## preRenderHook NULL NULL NULL NULL NULL NULL NULL NULL
## jsHooks List,0 List,0 List,0 List,0 List,0 List,0 List,0 List,0
## [,17] [,18] [,19] [,20]
## x List,6 List,6 List,6 List,6
## width NULL NULL NULL NULL
## height NULL NULL NULL NULL
## sizingPolicy List,6 List,6 List,6 List,6
## dependencies NULL NULL NULL NULL
## elementId NULL NULL NULL NULL
## preRenderHook NULL NULL NULL NULL
## jsHooks List,0 List,0 List,0 List,0
با توجه به نتایج به دست آمده ایران در دسته اول قرار دارد.
finance_indicators_data = w_data %>% filter(Indicator.Code %in% finance_indicators$Series.Code) %>% filter(between(row_number(), 941, 5280)) %>% melt
colnames(finance_indicators_data)[5] = 'year'
colnames(finance_indicators_data)[6] = 'val'
finance_indicators_data = finance_indicators_data %>% select(Country.Name, Indicator.Name, year, val) %>% reshape(., timevar = 'Indicator.Name', direction = 'wide', idvar = c('Country.Name', 'year')) %>% group_by(Country.Name) %>% summarise_all(funs(mean(., na.rm = TRUE)))
finance_indicators_data[3:22] = data.frame(apply(finance_indicators_data[3:22], 2, function(index){
index = as.numeric(as.character(index))
index[is.na(index)] = mean(index, na.rm = TRUE)
index
}))
Countries = finance_indicators_data$Country.Name
finance_indicators_data = finance_indicators_data[,3:22]
rownames(finance_indicators_data) = Countries
finance_indicators_data = scale(finance_indicators_data)
kmeans_res = kmeans(finance_indicators_data, centers = 3)
kmeans_res$cluster[91]
## Iran, Islamic Rep.
## 2
fviz_cluster(kmeans_res, finance_indicators_data)
بله چون در راستای مولفههای PCA تشکیل شده اند.
pca = prcomp(finance_indicators_data)
fviz_pca_biplot(pca, habillage = as.factor(kmeans_res$cluster))
iran_economy = w_data %>% filter(Indicator.Code == 'NY.GDP.MKTP.KD.ZG' & Country.Code == 'IRN') %>% select(starts_with('X')) %>% t() %>% data.frame()
colnames(iran_economy)[1] = 'economic_grow'
iran_economy[is.na(iran_economy)] = mean(iran_economy$economic_grow, na.rm = TRUE)
x = sapply(2:15, function(year){
regression = data.frame()
for (i in 1:nrow(iran_economy) - year)regression = rbind(regression, iran_economy$economic_grow[i:i + year])
cat(sprintf('%d years used for regression\n', year))
print(mean(summary(lm(regression[,ncol(regression)] ~ ., data = regression))$residuals ^ 2))
return(0)
})
## 2 years used for regression
## [1] 6.730351e-29
## 3 years used for regression
## [1] 6.730351e-29
## 4 years used for regression
## [1] 6.730351e-29
## 5 years used for regression
## [1] 6.730351e-29
## 6 years used for regression
## [1] 6.730351e-29
## 7 years used for regression
## [1] 6.730351e-29
## 8 years used for regression
## [1] 6.730351e-29
## 9 years used for regression
## [1] 6.730351e-29
## 10 years used for regression
## [1] 6.730351e-29
## 11 years used for regression
## [1] 6.730351e-29
## 12 years used for regression
## [1] 6.730351e-29
## 13 years used for regression
## [1] 6.730351e-29
## 14 years used for regression
## [1] 6.730351e-29
## 15 years used for regression
## [1] 6.730351e-29
problem567_generic = function(indicators_code){
indicators = w_series %>% filter(Series.Code %in% indicators_code) %>% select(Series.Code, Indicator.Name)
sapply(1:20, function(index){
df = w_data %>% filter(Indicator.Code == indicators$Series.Code[index]) %>% filter(between(row_number(), 48, 264)) %>% melt
colnames(df)[5] = 'year'
colnames(df)[6] = 'name'
iran_df = w_data %>% filter(Indicator.Code == indicators$Series.Code[index] & Country.Code == 'IRN') %>% select(starts_with('X')) %>% t() %>% data.frame() %>% setDT(keep.rownames = TRUE) %>% filter(between(row_number(), 1, 57))
colnames(iran_df)[1] = 'year'
colnames(iran_df)[2] = 'name'
print(hcboxplot(x = df$name, var = df$year, outliers = FALSE) %>% hc_chart(type = 'column') %>% hc_add_series(name = 'Iran', iran_df, 'line', hcaes(x = year, y = name)) %>% hc_yAxis(title = list(text = indicators$Indicator.Name[index])) %>% hc_xAxis(title = list(text = 'Year')) %>% hc_add_theme(hc_theme_monokai()))
})
indicators_data = w_data %>% filter(Indicator.Code %in% indicators$Series.Code) %>% filter(between(row_number(), 941, 5280)) %>% melt
colnames(indicators_data)[5] = 'year'
colnames(indicators_data)[6] = 'val'
indicators_data = indicators_data %>% select(Country.Name, Indicator.Name, year, val) %>% reshape(., timevar = 'Indicator.Name', direction = 'wide', idvar = c('Country.Name', 'year')) %>% group_by(Country.Name) %>% summarise_all(funs(mean(., na.rm = TRUE)))
indicators_data[3:22] = data.frame(apply(indicators_data[3:22], 2, function(index){
index = as.numeric(as.character(index))
index[is.na(index)] = mean(index, na.rm = TRUE)
index
}))
Countries = indicators_data$Country.Name
indicators_data = indicators_data[,3:22]
rownames(indicators_data) = Countries
indicators_data = scale(indicators_data)
kmeans_res = kmeans(indicators_data, centers = 3)
print(kmeans_res$cluster[91])
print(fviz_cluster(kmeans_res, indicators_data))
pca = prcomp(finance_indicators_data)
print(fviz_pca_biplot(pca, habillage = as.factor(kmeans_res$cluster)))
}
health = problem567_generic(c('SH.DYN.NMRT', 'SH.DTH.MORT', 'SP.DYN.AMRT.FE', 'SP.DYN.AMRT.MA', 'SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN','SN.ITK.DFCT', 'SP.DYN.TO65.FE.ZS', 'SP.DYN.TO65.MA.ZS', 'SH.MMR.RISK.ZS', 'SP.POP.BRTH.MF', 'SP.POP.80UP.MA.5Y', 'SP.POP.80UP.FE.5Y', 'SN.ITK.DEFC.ZS', 'SH.ANM.NPRG.ZS','SP.POP.0014.TO', 'SP.POP.GROW', 'SP.POP.DPND', 'SH.DTH.NMRT', 'SH.MED.NUMW.P3'))
## Iran, Islamic Rep.
## 2
edu = problem567_generic(c('SE.XPD.TOTL.GD.ZS', 'SE.XPD.TERT.PC.ZS', 'SE.XPD.PRIM.ZS', 'SE.XPD.MTOT.ZS', 'SE.XPD.CTOT.ZS', 'SE.TER.TCHR.FE.ZS', 'SE.TER.ENRR', 'SE.TER.ENRL.TC.ZS', 'SE.SEC.TCHR', 'SE.SEC.ENRL', 'SE.PRM.UNER.MA.ZS', 'SE.PRM.TENR', 'SE.ADT.1524.LT.ZS', 'SE.COM.DURS', 'SE.PRE.DURS', 'SE.PRM.UNER.FE.ZS', 'SE.PRM.AGES', 'SE.PRM.ENRR.FE', 'SE.PRM.ENRR.MA', 'SE.PRM.GINT.ZS'))
## Iran, Islamic Rep.
## 3
indicators_code <- c('SE.XPD.TOTL.GD.ZS', 'SE.XPD.TERT.PC.ZS', 'SE.XPD.PRIM.ZS', 'SE.XPD.MTOT.ZS', 'SE.XPD.CTOT.ZS', 'SE.TER.TCHR.FE.ZS', 'SE.TER.ENRR', 'SE.TER.ENRL.TC.ZS', 'SE.SEC.TCHR', 'SE.SEC.ENRL', 'SE.PRM.UNER.MA.ZS', 'SE.PRM.TENR', 'SE.ADT.1524.LT.ZS', 'SE.COM.DURS', 'SE.PRE.DURS', 'SE.PRM.UNER.FE.ZS', 'SE.PRM.AGES', 'SE.PRM.ENRR.FE', 'SE.PRM.ENRR.MA', 'SE.PRM.GINT.ZS', 'SH.DYN.NMRT', 'SH.DTH.MORT', 'SP.DYN.AMRT.FE', 'SP.DYN.AMRT.MA', 'SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN','SN.ITK.DFCT', 'SP.DYN.TO65.FE.ZS', 'SP.DYN.TO65.MA.ZS', 'SH.MMR.RISK.ZS', 'SP.POP.BRTH.MF', 'SP.POP.80UP.MA.5Y', 'SP.POP.80UP.FE.5Y', 'SN.ITK.DEFC.ZS', 'SH.ANM.NPRG.ZS','SP.POP.0014.TO', 'SP.POP.GROW', 'SP.POP.DPND', 'SH.DTH.NMRT', 'SH.MED.NUMW.P3', 'FP.CPI.TOTL.ZG', 'NY.GDP.MKTP.KD.ZG', 'NY.GDP.MKTP.KD', 'NY.GDP.FCST.CD', 'BX.GSR.TOTL.CD', 'CM.MKT.TRAD.GD.ZS', 'FR.INR.LNDP', 'CM.MKT.LCAP.GD.ZS', 'FR.INR.DPST', 'NE.GDI.STKB.CD', 'NE.GDI.TOTL.CD', 'NE.IMP.GNFS.ZS', 'NE.TRD.GNFS.ZS', 'NV.IND.MANF.ZS', 'NV.IND.TOTL.ZS', 'NY.TAX.NIND.CD', 'NY.GSR.NFCY.CD', 'TX.VAL.MRCH.WL.CD', 'BX.KLT.DINV.WD.GD.ZS' , 'FM.LBL.BMNY.GD.ZS')
indicators = w_data %>% filter(Indicator.Code %in% indicators_code)
indicators = indicators[2821:15840,5:63]
indicators = data.frame(apply(indicators, 2, function(index){
index = as.numeric(as.character(index))
index[is.na(index)] = mean(index, na.rm = TRUE)
index
}))
distances = stats::dist(indicators,method = 'euclidean')
cluster_plot = hclust(distances, method = 'complete')
plot(cluster_plot, method = 'compelete')
در نمودار زیر امید به زندگی مردم ایران نسبت به کل دنیا نشان داده شده است. با توجه به نمودار میتوان فهمید که در دوران جنگ تحمیلی امید به زندگی مردم ایران بسیار کاهش یافته است.
iran_life_expectancy = w_data %>% filter(Indicator.Code == 'SP.DYN.LE00.IN' & Country.Code == 'IRN') %>% select(starts_with('X')) %>% t() %>% data.frame() %>% setDT(keep.rownames = TRUE)
iran_life_expectancy = iran_life_expectancy[1:57,]
colnames(iran_life_expectancy)[1] = 'year'
colnames(iran_life_expectancy)[2] = 'life_expectancy'
life_indicator = w_data %>% filter(Indicator.Code == 'SP.DYN.LE00.IN')
life_indicator = life_indicator[48:264,] %>% melt
colnames(life_indicator)[5] = 'year'
colnames(life_indicator)[6] = 'life_expectancy'
hcboxplot(x = life_indicator$life_expectancy, var = life_indicator$year, outliers = FALSE) %>% hc_chart(type = 'column') %>% hc_add_series(iran_life_expectancy, 'line', hcaes(x = year, y = life_expectancy), name = 'IRAN') %>% hc_xAxis(title = list(text = 'Year')) %>% hc_yAxis(title = list(text = 'Life Expectancy')) %>% hc_title(text = 'Life Expectancy per Year') %>% hc_add_theme(hc_theme_monokai())
در این نمودار هزینههای نظامی ایران به همراه میانگین جهانی آن رسم شده است. با توجه به شکل میتوان فهمید که در دوران جنگ تحمیلی این مقادیر بسیار افزایش یافته است.
iran_military_expenditure = w_data %>% filter(Indicator.Code == 'MS.MIL.XPND.GD.ZS' & Country.Code == 'IRN') %>% select(starts_with('X')) %>% t() %>% data.frame() %>% setDT(keep.rownames = TRUE)
iran_military_expenditure = iran_military_expenditure[1:57,]
colnames(iran_military_expenditure)[1] = 'year'
colnames(iran_military_expenditure)[2] = 'military_expenditure'
military_indicator = w_data %>% filter(Indicator.Code == 'MS.MIL.XPND.GD.ZS')
military_indicator = military_indicator[48:264,] %>% melt
colnames(military_indicator)[5] = 'year'
colnames(military_indicator)[6] = 'military_expenditure'
hcboxplot(x = military_indicator$military_expenditure, var = military_indicator$year, outliers = FALSE) %>% hc_chart(type = 'column') %>% hc_add_series(iran_military_expenditure, 'line', hcaes(x = year, y = military_expenditure), name = 'IRAN') %>% hc_xAxis(title = list(text = 'Year')) %>% hc_yAxis(title = list(text = 'Military expenditure')) %>% hc_title(text = 'Military expenditure per Year') %>% hc_add_theme(hc_theme_monokai())
در این قسمت به پیدا کردن تعداد قتلهای عمدی در هر ۱۰۰۰۰۰ نفر از جمعیت پرداختهایم. بیشترین تعداد قتلها متعلق به کشورهای آمریکای لاتین و جنوبی است که قاچاق مواد مخدر در آنها رایج است.
homicide_indicator = w_data %>% filter(Indicator.Code == 'VC.IHR.PSRC.P5')
homicide_indicator$homicide = homicide_indicator %>% select(starts_with('X')) %>% apply(., 1, mean, na.rm = TRUE)
homicide_indicator = homicide_indicator %>% arrange(desc(homicide)) %>% top_n(10)
homicide_indicator %>% select(Country.Name, homicide)
## Country.Name homicide
## 1 El Salvador 68.44866
## 2 Honduras 63.67554
## 3 Colombia 47.34863
## 4 Jamaica 44.54665
## 5 Iraq 43.36667
## 6 South Africa 42.37494
## 7 Lesotho 39.55000
## 8 Venezuela, RB 39.25473
## 9 Virgin Islands (U.S.) 35.62500
## 10 Guatemala 34.88277